In [1]:
#installing the necessary packages if not pre-installed
# !pip install pandas
# !pip install plotly
# !pip install seaborn
# !pip install matplotlib
# !pip install numpy
# !pip install nbformat
# !pip install chart_studio
# !pip install matplotlib-colorbar
# !pip install category_encoders
In [2]:
#importing the necessary libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as patches
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

Read the csv file¶

In [3]:
#importing the e-commerce dataset into a dataframe
ecd = pd.read_csv('E-Commerce Churn Data.csv')
ecd
Out[3]:
CustomerID Churn Tenure PreferredLoginDevice CityTier WarehouseToHome PreferredPaymentMode Gender HourSpendOnApp NumberOfDeviceRegistered PreferedOrderCat SatisfactionScore MaritalStatus NumberOfAddress Complain OrderAmountHikeFromlastYear CouponUsed OrderCount DaySinceLastOrder CashbackAmount
0 50001 1 4.0 Mobile Phone 3 6.0 Debit Card Female 3.0 3 Laptop & Accessory 2 Single 9 1 11.0 1.0 1.0 5.0 160
1 50002 1 NaN Phone 1 8.0 UPI Male 3.0 4 Mobile 3 Single 7 1 15.0 0.0 1.0 0.0 121
2 50003 1 NaN Phone 1 30.0 Debit Card Male 2.0 4 Mobile 3 Single 6 1 14.0 0.0 1.0 3.0 120
3 50004 1 0.0 Phone 3 15.0 Debit Card Male 2.0 4 Laptop & Accessory 5 Single 8 0 23.0 0.0 1.0 3.0 134
4 50005 1 0.0 Phone 1 12.0 CC Male NaN 3 Mobile 5 Single 3 0 11.0 1.0 1.0 3.0 130
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5625 55626 0 10.0 Computer 1 30.0 Credit Card Male 3.0 2 Laptop & Accessory 1 Married 6 0 18.0 1.0 2.0 4.0 151
5626 55627 0 13.0 Mobile Phone 1 13.0 Credit Card Male 3.0 5 Fashion 5 Married 6 0 16.0 1.0 2.0 NaN 225
5627 55628 0 1.0 Mobile Phone 1 11.0 Debit Card Male 3.0 2 Laptop & Accessory 4 Married 3 1 21.0 1.0 2.0 4.0 186
5628 55629 0 23.0 Computer 3 9.0 Credit Card Male 4.0 5 Laptop & Accessory 4 Married 4 0 15.0 2.0 2.0 9.0 179
5629 55630 0 8.0 Mobile Phone 1 15.0 Credit Card Male 3.0 2 Laptop & Accessory 3 Married 4 0 13.0 2.0 2.0 3.0 169

5630 rows × 20 columns

In [4]:
#checking the column names
ecd.columns
Out[4]:
Index(['CustomerID', 'Churn', 'Tenure', 'PreferredLoginDevice', 'CityTier',
       'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
       'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore',
       'MaritalStatus', 'NumberOfAddress', 'Complain',
       'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
       'DaySinceLastOrder', 'CashbackAmount'],
      dtype='object')

Data Dictionary¶

Variable Description
CustomerID Unique customer ID
Churn Churn Flag (Yes = 1; No = 0)
Tenure Tenure of customer in organization (i.e. length of time using the service)
PreferredLoginDevice Preferred login device of customer
CityTier City tier
WarehouseToHome Distance in between warehouse to home of customer
PreferredPaymentMode Preferred payment method of customer
Gender Gender of customer
HourSpendOnApp Number of hours spent on mobile application or website
NumberOfDeviceRegistered Total number of devices registered to particular customer
PreferedOrderCat Preferred order category of customer in last month
SatisfactionScore Level of customer satisfaction with service
MaritalStatus Marital status of customer
NumberOfAddress Total number of addresses added on particular customer
Complain Any complaint has been raised in last month
OrderAmountHikeFromlastYear Percentage increase in order from last year
CouponUsed Total number of coupons used in last month
OrderCount Total number of orders placed in last month
DaySinceLastOrder Days since last order by customer
CashbackAmount Average cashback in last month

Exploring the dataset and Cleaning data¶

In [5]:
#checking the description of dataset
pd.options.display.float_format = '{:20,.2f}'.format
ecd.describe().T
Out[5]:
count mean std min 25% 50% 75% max
CustomerID 5,630.00 52,815.50 1,625.39 50,001.00 51,408.25 52,815.50 54,222.75 55,630.00
Churn 5,630.00 0.17 0.37 0.00 0.00 0.00 0.00 1.00
Tenure 5,366.00 10.19 8.56 0.00 2.00 9.00 16.00 61.00
CityTier 5,630.00 1.65 0.92 1.00 1.00 1.00 3.00 3.00
WarehouseToHome 5,379.00 15.64 8.53 5.00 9.00 14.00 20.00 127.00
HourSpendOnApp 5,375.00 2.93 0.72 0.00 2.00 3.00 3.00 5.00
NumberOfDeviceRegistered 5,630.00 3.69 1.02 1.00 3.00 4.00 4.00 6.00
SatisfactionScore 5,630.00 3.07 1.38 1.00 2.00 3.00 4.00 5.00
NumberOfAddress 5,630.00 4.21 2.58 1.00 2.00 3.00 6.00 22.00
Complain 5,630.00 0.28 0.45 0.00 0.00 0.00 1.00 1.00
OrderAmountHikeFromlastYear 5,365.00 15.71 3.68 11.00 13.00 15.00 18.00 26.00
CouponUsed 5,374.00 1.75 1.89 0.00 1.00 1.00 2.00 16.00
OrderCount 5,372.00 3.01 2.94 1.00 1.00 2.00 3.00 16.00
DaySinceLastOrder 5,323.00 4.54 3.65 0.00 2.00 3.00 7.00 46.00
CashbackAmount 5,630.00 177.22 49.19 0.00 146.00 163.00 196.00 325.00
In [6]:
#checking the first 10 records of the dataset
ecd.head()
Out[6]:
CustomerID Churn Tenure PreferredLoginDevice CityTier WarehouseToHome PreferredPaymentMode Gender HourSpendOnApp NumberOfDeviceRegistered PreferedOrderCat SatisfactionScore MaritalStatus NumberOfAddress Complain OrderAmountHikeFromlastYear CouponUsed OrderCount DaySinceLastOrder CashbackAmount
0 50001 1 4.00 Mobile Phone 3 6.00 Debit Card Female 3.00 3 Laptop & Accessory 2 Single 9 1 11.00 1.00 1.00 5.00 160
1 50002 1 NaN Phone 1 8.00 UPI Male 3.00 4 Mobile 3 Single 7 1 15.00 0.00 1.00 0.00 121
2 50003 1 NaN Phone 1 30.00 Debit Card Male 2.00 4 Mobile 3 Single 6 1 14.00 0.00 1.00 3.00 120
3 50004 1 0.00 Phone 3 15.00 Debit Card Male 2.00 4 Laptop & Accessory 5 Single 8 0 23.00 0.00 1.00 3.00 134
4 50005 1 0.00 Phone 1 12.00 CC Male NaN 3 Mobile 5 Single 3 0 11.00 1.00 1.00 3.00 130
In [7]:
ecd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   5630 non-null   int64  
 1   Churn                        5630 non-null   int64  
 2   Tenure                       5366 non-null   float64
 3   PreferredLoginDevice         5630 non-null   object 
 4   CityTier                     5630 non-null   int64  
 5   WarehouseToHome              5379 non-null   float64
 6   PreferredPaymentMode         5630 non-null   object 
 7   Gender                       5630 non-null   object 
 8   HourSpendOnApp               5375 non-null   float64
 9   NumberOfDeviceRegistered     5630 non-null   int64  
 10  PreferedOrderCat             5630 non-null   object 
 11  SatisfactionScore            5630 non-null   int64  
 12  MaritalStatus                5630 non-null   object 
 13  NumberOfAddress              5630 non-null   int64  
 14  Complain                     5630 non-null   int64  
 15  OrderAmountHikeFromlastYear  5365 non-null   float64
 16  CouponUsed                   5374 non-null   float64
 17  OrderCount                   5372 non-null   float64
 18  DaySinceLastOrder            5323 non-null   float64
 19  CashbackAmount               5630 non-null   int64  
dtypes: float64(7), int64(8), object(5)
memory usage: 879.8+ KB
In [8]:
#Detecting missing values
ecd.isnull().any()
Out[8]:
CustomerID                     False
Churn                          False
Tenure                          True
PreferredLoginDevice           False
CityTier                       False
WarehouseToHome                 True
PreferredPaymentMode           False
Gender                         False
HourSpendOnApp                  True
NumberOfDeviceRegistered       False
PreferedOrderCat               False
SatisfactionScore              False
MaritalStatus                  False
NumberOfAddress                False
Complain                       False
OrderAmountHikeFromlastYear     True
CouponUsed                      True
OrderCount                      True
DaySinceLastOrder               True
CashbackAmount                 False
dtype: bool
In [9]:
#counting the missing values
ecd.isna().sum()
Out[9]:
CustomerID                       0
Churn                            0
Tenure                         264
PreferredLoginDevice             0
CityTier                         0
WarehouseToHome                251
PreferredPaymentMode             0
Gender                           0
HourSpendOnApp                 255
NumberOfDeviceRegistered         0
PreferedOrderCat                 0
SatisfactionScore                0
MaritalStatus                    0
NumberOfAddress                  0
Complain                         0
OrderAmountHikeFromlastYear    265
CouponUsed                     256
OrderCount                     258
DaySinceLastOrder              307
CashbackAmount                   0
dtype: int64
In [10]:
#plotting the missing values
ecd.isna().sum().plot(kind='bar', color='Red')
plt.show()
No description has been provided for this image
In [11]:
#total number of missing values in our dataset
ecd.isna().sum().sum()
Out[11]:
1856
In [12]:
#creating a function to display the data type, percentage of missing values and number of unique values per column
def sniff_modified(df):
    with pd.option_context("display.max_colwidth", 20):
        info = pd.DataFrame()
        info['data type'] = df.dtypes
        info['percent missing'] = df.isnull().sum()*100/len(df)
        info['No. unique'] = df.apply(lambda x: len(x.unique()))
        info['unique values'] = df.apply(lambda x: x.unique())
        return info.sort_values('data type')
In [13]:
sniff_modified(ecd)
Out[13]:
data type percent missing No. unique unique values
CustomerID int64 0.00 5630 [50001, 50002, 50003, 50004, 50005, 50006, 500...
Complain int64 0.00 2 [1, 0]
NumberOfAddress int64 0.00 15 [9, 7, 6, 8, 3, 2, 4, 10, 1, 5, 19, 21, 11, 20...
SatisfactionScore int64 0.00 5 [2, 3, 5, 4, 1]
NumberOfDeviceRegistered int64 0.00 6 [3, 4, 5, 2, 1, 6]
Churn int64 0.00 2 [1, 0]
CityTier int64 0.00 3 [3, 1, 2]
CashbackAmount int64 0.00 220 [160, 121, 120, 134, 130, 139, 123, 127, 295, ...
WarehouseToHome float64 4.46 35 [6.0, 8.0, 30.0, 15.0, 12.0, 22.0, 11.0, 9.0, ...
HourSpendOnApp float64 4.53 7 [3.0, 2.0, nan, 1.0, 0.0, 4.0, 5.0]
DaySinceLastOrder float64 5.45 23 [5.0, 0.0, 3.0, 7.0, 2.0, 1.0, 8.0, 6.0, 4.0, ...
Tenure float64 4.69 37 [4.0, nan, 0.0, 13.0, 11.0, 9.0, 19.0, 20.0, 1...
OrderAmountHikeFromlastYear float64 4.71 17 [11.0, 15.0, 14.0, 23.0, 22.0, 16.0, 12.0, nan...
CouponUsed float64 4.55 18 [1.0, 0.0, 4.0, 2.0, 9.0, 6.0, 11.0, nan, 7.0,...
OrderCount float64 4.58 17 [1.0, 6.0, 2.0, 15.0, 4.0, 7.0, 3.0, 9.0, nan,...
PreferredPaymentMode object 0.00 7 [Debit Card, UPI, CC, Cash on Delivery, E wall...
Gender object 0.00 2 [Female, Male]
PreferedOrderCat object 0.00 6 [Laptop & Accessory, Mobile, Mobile Phone, Oth...
PreferredLoginDevice object 0.00 3 [Mobile Phone, Phone, Computer]
MaritalStatus object 0.00 3 [Single, Divorced, Married]
In [14]:
#creating a function to find the columns with missing values, extract the number and percentage of these missing values in relation to the dataset

def FindMissingColsPercentage(df):

    total = 0
    for col in df.columns:
        missing_vals = df[col].isnull().sum()
        # mean = sum / total
        pct = df[col].isnull().mean() * 100 
        if missing_vals != 0:
          print('{} => {} [{}%]'.format(col, df[col].isnull().sum(), round(pct, 2)))
        total += missing_vals
    
    if total == 0:
        print("no missing values")
In [15]:
FindMissingColsPercentage(ecd)
Tenure => 264 [4.69%]
WarehouseToHome => 251 [4.46%]
HourSpendOnApp => 255 [4.53%]
OrderAmountHikeFromlastYear => 265 [4.71%]
CouponUsed => 256 [4.55%]
OrderCount => 258 [4.58%]
DaySinceLastOrder => 307 [5.45%]
In [16]:
#replacing all empty spaces with np.NaN
ecd_clean = ecd.replace(" ", np.NaN)
In [17]:
# replacing all missing values(NaN) in the dataset with 0
ecd_clean = ecd_clean.fillna(0)
In [18]:
#checking dataset information after replacing the missing values
ecd_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   5630 non-null   int64  
 1   Churn                        5630 non-null   int64  
 2   Tenure                       5630 non-null   float64
 3   PreferredLoginDevice         5630 non-null   object 
 4   CityTier                     5630 non-null   int64  
 5   WarehouseToHome              5630 non-null   float64
 6   PreferredPaymentMode         5630 non-null   object 
 7   Gender                       5630 non-null   object 
 8   HourSpendOnApp               5630 non-null   float64
 9   NumberOfDeviceRegistered     5630 non-null   int64  
 10  PreferedOrderCat             5630 non-null   object 
 11  SatisfactionScore            5630 non-null   int64  
 12  MaritalStatus                5630 non-null   object 
 13  NumberOfAddress              5630 non-null   int64  
 14  Complain                     5630 non-null   int64  
 15  OrderAmountHikeFromlastYear  5630 non-null   float64
 16  CouponUsed                   5630 non-null   float64
 17  OrderCount                   5630 non-null   float64
 18  DaySinceLastOrder            5630 non-null   float64
 19  CashbackAmount               5630 non-null   int64  
dtypes: float64(7), int64(8), object(5)
memory usage: 879.8+ KB
In [19]:
ecd_clean.isnull().any()
Out[19]:
CustomerID                     False
Churn                          False
Tenure                         False
PreferredLoginDevice           False
CityTier                       False
WarehouseToHome                False
PreferredPaymentMode           False
Gender                         False
HourSpendOnApp                 False
NumberOfDeviceRegistered       False
PreferedOrderCat               False
SatisfactionScore              False
MaritalStatus                  False
NumberOfAddress                False
Complain                       False
OrderAmountHikeFromlastYear    False
CouponUsed                     False
OrderCount                     False
DaySinceLastOrder              False
CashbackAmount                 False
dtype: bool
In [20]:
FindMissingColsPercentage(ecd_clean)
no missing values
In [21]:
#viewing the object data type columns
ecd_clean[['PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'PreferredLoginDevice', 'MaritalStatus']]
Out[21]:
PreferredPaymentMode Gender PreferedOrderCat PreferredLoginDevice MaritalStatus
0 Debit Card Female Laptop & Accessory Mobile Phone Single
1 UPI Male Mobile Phone Single
2 Debit Card Male Mobile Phone Single
3 Debit Card Male Laptop & Accessory Phone Single
4 CC Male Mobile Phone Single
... ... ... ... ... ...
5625 Credit Card Male Laptop & Accessory Computer Married
5626 Credit Card Male Fashion Mobile Phone Married
5627 Debit Card Male Laptop & Accessory Mobile Phone Married
5628 Credit Card Male Laptop & Accessory Computer Married
5629 Credit Card Male Laptop & Accessory Mobile Phone Married

5630 rows × 5 columns

In [22]:
#checking the unique values in these columns
obj = ['PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'PreferredLoginDevice', 'MaritalStatus']  #creating a list to the column names

for i in obj: #creating a for loop to print out the column name and unique values and count
    #print(i, ecd_clean[i].nunique(), '\n',ecd_clean[i].unique())
    print(ecd_clean[i].value_counts(),'\n')
PreferredPaymentMode
Debit Card          2314
Credit Card         1501
E wallet             614
UPI                  414
COD                  365
CC                   273
Cash on Delivery     149
Name: count, dtype: int64 

Gender
Male      3384
Female    2246
Name: count, dtype: int64 

PreferedOrderCat
Laptop & Accessory    2050
Mobile Phone          1271
Fashion                826
Mobile                 809
Grocery                410
Others                 264
Name: count, dtype: int64 

PreferredLoginDevice
Mobile Phone    2765
Computer        1634
Phone           1231
Name: count, dtype: int64 

MaritalStatus
Married     2986
Single      1796
Divorced     848
Name: count, dtype: int64 

In [23]:
#replacing the data entries that has the same meaning
ecd_clean['PreferredPaymentMode'] = ecd_clean['PreferredPaymentMode'].replace('CC', 'Credit Card')
ecd_clean['PreferredPaymentMode'] = ecd_clean['PreferredPaymentMode'].replace('COD', 'Cash on Delivery')
ecd_clean['PreferedOrderCat'] = ecd_clean['PreferedOrderCat'].replace('Mobile', 'Mobile Phone')
ecd_clean['PreferredLoginDevice'] = ecd_clean['PreferredLoginDevice'].replace('Phone', 'Mobile Phone')
#cross-checking the replacement has been effected
for i in obj: #creating a for loop to print out the column name and unique values and count
    print(ecd_clean[i].value_counts(),'\n')
PreferredPaymentMode
Debit Card          2314
Credit Card         1774
E wallet             614
Cash on Delivery     514
UPI                  414
Name: count, dtype: int64 

Gender
Male      3384
Female    2246
Name: count, dtype: int64 

PreferedOrderCat
Mobile Phone          2080
Laptop & Accessory    2050
Fashion                826
Grocery                410
Others                 264
Name: count, dtype: int64 

PreferredLoginDevice
Mobile Phone    3996
Computer        1634
Name: count, dtype: int64 

MaritalStatus
Married     2986
Single      1796
Divorced     848
Name: count, dtype: int64 

In [24]:
#creating categories from tenure column into a new column - tenuregroup
ranges = [0,10,20,30,40,50,60,np.inf] #list to hold the bin ranges
group_names = ['0-10 years', '11-20 years', '21-30 years', '31-40 years', '41-50 years', '51-60 years', '61 years & above'] # list to hold the labels
ecd_clean['TenureGroup'] = pd.cut(ecd_clean['Tenure'], bins = ranges, labels = group_names, include_lowest = True)
ecd_clean[['Tenure', 'TenureGroup']]
Out[24]:
Tenure TenureGroup
0 4.00 0-10 years
1 0.00 0-10 years
2 0.00 0-10 years
3 0.00 0-10 years
4 0.00 0-10 years
... ... ...
5625 10.00 0-10 years
5626 13.00 11-20 years
5627 1.00 0-10 years
5628 23.00 21-30 years
5629 8.00 0-10 years

5630 rows × 2 columns

In [25]:
#checking the final clean data
ecd_clean.head(20), ecd_clean.shape
Out[25]:
(    CustomerID  Churn               Tenure PreferredLoginDevice  CityTier  \
 0        50001      1                 4.00         Mobile Phone         3   
 1        50002      1                 0.00         Mobile Phone         1   
 2        50003      1                 0.00         Mobile Phone         1   
 3        50004      1                 0.00         Mobile Phone         3   
 4        50005      1                 0.00         Mobile Phone         1   
 5        50006      1                 0.00             Computer         1   
 6        50007      1                 0.00         Mobile Phone         3   
 7        50008      1                 0.00         Mobile Phone         1   
 8        50009      1                13.00         Mobile Phone         3   
 9        50010      1                 0.00         Mobile Phone         1   
 10       50011      1                 4.00         Mobile Phone         1   
 11       50012      1                11.00         Mobile Phone         1   
 12       50013      1                 0.00         Mobile Phone         1   
 13       50014      1                 0.00         Mobile Phone         1   
 14       50015      1                 9.00         Mobile Phone         3   
 15       50016      1                 0.00         Mobile Phone         2   
 16       50017      1                 0.00             Computer         1   
 17       50018      1                 0.00         Mobile Phone         3   
 18       50019      1                 0.00             Computer         1   
 19       50020      1                19.00         Mobile Phone         1   
 
         WarehouseToHome PreferredPaymentMode  Gender       HourSpendOnApp  \
 0                  6.00           Debit Card  Female                 3.00   
 1                  8.00                  UPI    Male                 3.00   
 2                 30.00           Debit Card    Male                 2.00   
 3                 15.00           Debit Card    Male                 2.00   
 4                 12.00          Credit Card    Male                 0.00   
 5                 22.00           Debit Card  Female                 3.00   
 6                 11.00     Cash on Delivery    Male                 2.00   
 7                  6.00          Credit Card    Male                 3.00   
 8                  9.00             E wallet    Male                 0.00   
 9                 31.00           Debit Card    Male                 2.00   
 10                18.00     Cash on Delivery  Female                 2.00   
 11                 6.00           Debit Card    Male                 3.00   
 12                11.00     Cash on Delivery    Male                 2.00   
 13                15.00          Credit Card    Male                 3.00   
 14                15.00          Credit Card    Male                 3.00   
 15                12.00                  UPI    Male                 3.00   
 16                12.00           Debit Card  Female                 0.00   
 17                11.00             E wallet    Male                 2.00   
 18                13.00           Debit Card    Male                 3.00   
 19                20.00           Debit Card  Female                 3.00   
 
     NumberOfDeviceRegistered  ... SatisfactionScore  MaritalStatus  \
 0                          3  ...                 2         Single   
 1                          4  ...                 3         Single   
 2                          4  ...                 3         Single   
 3                          4  ...                 5         Single   
 4                          3  ...                 5         Single   
 5                          5  ...                 5         Single   
 6                          3  ...                 2       Divorced   
 7                          3  ...                 2       Divorced   
 8                          4  ...                 3       Divorced   
 9                          5  ...                 3         Single   
 10                         3  ...                 3       Divorced   
 11                         4  ...                 3         Single   
 12                         3  ...                 3         Single   
 13                         4  ...                 3       Divorced   
 14                         4  ...                 2         Single   
 15                         3  ...                 5        Married   
 16                         4  ...                 2         Single   
 17                         4  ...                 3         Single   
 18                         5  ...                 3         Single   
 19                         3  ...                 4       Divorced   
 
    NumberOfAddress  Complain  OrderAmountHikeFromlastYear  \
 0                9         1                        11.00   
 1                7         1                        15.00   
 2                6         1                        14.00   
 3                8         0                        23.00   
 4                3         0                        11.00   
 5                2         1                        22.00   
 6                4         0                        14.00   
 7                3         1                        16.00   
 8                2         1                        14.00   
 9                2         0                        12.00   
 10               2         0                         0.00   
 11              10         1                        13.00   
 12               2         1                        13.00   
 13               1         1                        17.00   
 14               2         0                        16.00   
 15               5         1                        22.00   
 16               2         1                        18.00   
 17               2         1                        11.00   
 18               2         1                        24.00   
 19              10         1                        18.00   
 
              CouponUsed           OrderCount    DaySinceLastOrder  \
 0                  1.00                 1.00                 5.00   
 1                  0.00                 1.00                 0.00   
 2                  0.00                 1.00                 3.00   
 3                  0.00                 1.00                 3.00   
 4                  1.00                 1.00                 3.00   
 5                  4.00                 6.00                 7.00   
 6                  0.00                 1.00                 0.00   
 7                  2.00                 2.00                 0.00   
 8                  0.00                 1.00                 2.00   
 9                  1.00                 1.00                 1.00   
 10                 9.00                15.00                 8.00   
 11                 0.00                 1.00                 0.00   
 12                 2.00                 2.00                 2.00   
 13                 0.00                 1.00                 0.00   
 14                 0.00                 4.00                 7.00   
 15                 1.00                 1.00                 2.00   
 16                 1.00                 1.00                 0.00   
 17                 1.00                 1.00                 3.00   
 18                 1.00                 1.00                 6.00   
 19                 1.00                 4.00                 3.00   
 
     CashbackAmount  TenureGroup  
 0              160   0-10 years  
 1              121   0-10 years  
 2              120   0-10 years  
 3              134   0-10 years  
 4              130   0-10 years  
 5              139   0-10 years  
 6              121   0-10 years  
 7              123   0-10 years  
 8              127  11-20 years  
 9              123   0-10 years  
 10             295   0-10 years  
 11             154  11-20 years  
 12             134   0-10 years  
 13             134   0-10 years  
 14             196   0-10 years  
 15             121   0-10 years  
 16             129   0-10 years  
 17             157   0-10 years  
 18             161   0-10 years  
 19             150  11-20 years  
 
 [20 rows x 21 columns],
 (5630, 21))

Exploratory Data Analysis¶

In [26]:
# Plotting graphs for better understanding of data distribution

cols = ['HourSpendOnApp', 'NumberOfDeviceRegistered', 'SatisfactionScore', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'DaySinceLastOrder']
ecd[cols].hist(figsize=(15,15), bins=15, color='Red')
Out[26]:
array([[<Axes: title={'center': 'HourSpendOnApp'}>,
        <Axes: title={'center': 'NumberOfDeviceRegistered'}>],
       [<Axes: title={'center': 'SatisfactionScore'}>,
        <Axes: title={'center': 'OrderAmountHikeFromlastYear'}>],
       [<Axes: title={'center': 'CouponUsed'}>,
        <Axes: title={'center': 'DaySinceLastOrder'}>]], dtype=object)
No description has been provided for this image
In [27]:
ecd_1 =  ecd_clean.copy() #creating a copy of the cleaned dataset
churn_labels = {0: "Customer Retained", 1: "Customer Churned"} #creating a dictionary for the churn labels
ecd_1['Churn'] = ecd_1['Churn'].replace(churn_labels) #replacing the binary numbers in the column to the new churn labels

#visualising customer churn rate
sns.countplot(x = 'Churn', data = ecd_1, palette=sns.color_palette('RdBu'), order=['Customer Retained', 'Customer Churned'])
plt.title("Churn Rate")
plt.show()
No description has been provided for this image

From the above plot, we can say more than 50% of the e-commerce company's customer base was retained while smaller portion stopped using their services.

In [28]:
#visualing the churn rate in percentages.

Cust_Churn_Count = (ecd_clean['Churn'] == 1).sum()
Cust_Retained_count = (ecd_clean['Churn'] == 0).sum()
 
# Defining the labels and sizes
labels = 'Customers Churned', 'Customers Retained'
sizes = [Cust_Churn_Count, Cust_Retained_count]
colors_given = ['#ff7f0e', '#66B3AA']

# Define the explode parameter (how much to pull apart the slices)
explode = (0.15, 0)  # Pull apart the first slice (Customer Left) by 15%
fig, ax = plt.subplots()

# Creating the pie chart
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, wedgeprops={'edgecolor': 'black'}, colors=colors_given, explode=explode)
ax.axis('equal')
legend_labels = ['Customers Churned', 'Customers Retained']

# Adding a legend
legend = ax.legend(legend_labels, title="", loc="center left", bbox_to_anchor=(1.1, 1))
legend.get_frame().set_edgecolor('black')

# Setting the title
plt.title('Customer Churn Analysis', fontweight='bold')
plt.show()
No description has been provided for this image

The pie chart above shows that 16.8% of the total customer based churned while 83.2% are still using the services of the e-commerce company

In [29]:
#visualising the percentage of customers that churned
ecd_clean['Churn'].value_counts().plot.pie(labels=['Customers who stayed', 'Customers who left'],
                                           autopct='%1.1f%%',shadow=True, colors=sns.color_palette('Reds'))
plt.title("Proportion of Customers churned")
plt.show()
No description has been provided for this image
In [30]:
#checking out churn rate by gender distribution using a pivot table 
ecd_clean.pivot_table(values="Tenure", index="Gender", columns="Churn",
                      aggfunc='sum')
Out[30]:
Churn 0 1
Gender
Female 22,011.00 946.00
Male 29,738.00 1,984.00
In [31]:
#visualing the gender distibution with churn rate
sns.catplot(x='Churn', data=ecd_1, hue='Gender', hue_order=['Male', 'Female'], 
            kind='count', palette=['red', 'darkred'],  # Set custom red shades
            order=['Customer Retained', 'Customer Churned'])

plt.title("Churn Rate by Gender")
plt.show()
No description has been provided for this image
In [32]:
sns.catplot(x='TenureGroup', data=ecd_clean, kind='count',palette=sns.color_palette('RdBu'))
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image

The count plot shows the distribution of customers segmented into different tenure groups. Most of the customers fall into the category of those that have been with the company for less than 10 years.

In [67]:
sns.catplot(x='TenureGroup', y='Churn', data=ecd_clean, kind='bar', palette=sns.color_palette('RdBu'))
plt.ylabel('Churn rate')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image

The above plot shows the mean of customer's that churn per catergory with a 95% confidence interval.

In [72]:
# Set context and font scale for better readability
sns.set_context("paper", font_scale=1.1)

# Create the plot using histplot
plt.figure(figsize=(10, 6))
sns.histplot(data=ecd_clean, x="DaySinceLastOrder", hue="Churn", multiple="stack", palette=["red", "green"], kde=False)

# Add titles and labels
plt.title('Analysis of Churn based on the Days Passed from Last Order Date')
plt.xlabel('Days Since Last Order')
plt.ylabel('Churn Rate')

# Show the legend with custom labels
plt.legend(title='Customer Status', labels=["Customers Retained", "Customers Churned"], loc='upper right')

# Display the plot
plt.show()
No description has been provided for this image
In [68]:
sns.set_context("paper",font_scale=1.1)
ax = sns.kdeplot(ecd.DaySinceLastOrder[(ecd_clean["Churn"] == 0) ],
                color="Red", shade = True);
ax = sns.kdeplot(ecd_clean.DaySinceLastOrder[(ecd_clean["Churn"] == 1) ],
                ax =ax, color="green", shade= True);
ax.legend(["Customers Retained","Customers Churned"],loc='upper right');
ax.set_ylabel('Churn Rate');
ax.set_xlabel('DaysSinceLastOrder');
ax.set_title('Analysis of Churn based on the days passed from Last Order Date');
No description has been provided for this image
In [35]:
palette = sns.color_palette('tab10')
sns.lineplot(
    data=ecd, x='OrderCount',  y="Churn", 
    hue="CityTier", ci=None,
    palette=palette
)

#CityTier
Out[35]:
<Axes: xlabel='OrderCount', ylabel='Churn'>
No description has been provided for this image

We see that for higher orders places, due to multiple reasons which could be operations, or availability of stock at warehouses , there is a high churn in tier. 1 and tier 3. Cities have the highest churn and it’s the lowers in tier 2

In [82]:
# Create a custom color palette for churn categories
palette = ["#FF6347", "#FFA07A"]  # Red for churned, Orange for retained

# Create the violin plot for Churn by Satisfaction Score
plt.figure(figsize=(10, 6))
sns.violinplot(data=ecd_1, x='Churn', y='SatisfactionScore', palette=palette)

# Add titles and labels
plt.title('Churn Distribution by Satisfaction Rate')
plt.xlabel('Churn')
plt.ylabel('Satisfaction Score')

# Display the plot
plt.show()
No description has been provided for this image
In [65]:
order = ['Customer Retained', 'Customer Churned']

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.countplot(data=ecd_1, x='Complain', hue='Churn', ax=axes[0], hue_order=order)
sns.countplot(data=ecd_1, x='SatisfactionScore', hue='Churn', ax=axes[1], hue_order=order)
#sns.countplot(data=ecd, x='PaymentMethod', hue='Churn', ax=axes[1, 0])
#sns.countplot(data=ecd, x='TechSupport', hue='Churn', ax=axes[1, 1])
plt.tight_layout()
No description has been provided for this image

About 50% of the customers that complained about the customer's services actually churned afterwards. Even though, customers provided a high satisfaction score with the service, some of these customers still left.

In [80]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate churn distribution by Preferred Order Category
churn_distribution = ecd_1.groupby(['PreferedOrderCat', 'Churn']).size().reset_index(name='Count')

# Pivot the data to make it easier to plot
churn_pivot = churn_distribution.pivot_table(index='PreferedOrderCat', columns='Churn', values='Count', aggfunc='sum').fillna(0)

# Plot a pie chart for each category in PreferedOrderCat
fig, axes = plt.subplots(1, len(churn_pivot), figsize=(15, 6))

# Define colors for Churned and Retained
colors = ['#FF6347', '#FFA07A']  # Red for churned, Orange for retained

for idx, category in enumerate(churn_pivot.index):
    # Plot pie chart for each 'PreferedOrderCat'
    axes[idx].pie(churn_pivot.loc[category], labels=["Churned", "Retained"], colors=colors, autopct='%1.1f%%', startangle=90)
    axes[idx].set_title(f"Churn Distribution - {category}")

# Adjust the layout
plt.tight_layout()

# Display the plot
plt.show()
No description has been provided for this image
In [60]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.countplot(data=ecd_1, x='CityTier', hue='Churn', ax=axes[0], hue_order=order)
sns.countplot(data=ecd_1, x='PreferedOrderCat', hue='Churn', ax=axes[1], 
              order=['Laptop & Accessory', 'Mobile Phone', 'Fashion', 'Grocey', 'Others'],
              hue_order = order)
plt.tight_layout()
No description has been provided for this image

Customers in CityTier 2 had the lowest churn rate in relation to Tier 1 and 3. Also in terms of prefered order catergoey of customers, those in the Mobile Phone catergory had the highest churn rate followed by Laptop & Accessory

In [38]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.countplot(data=ecd_1, x='HourSpendOnApp', hue='Churn', ax=axes[0], hue_order=order)
sns.countplot(data=ecd_1, x='NumberOfDeviceRegistered', hue='Churn', ax=axes[1], hue_order=order)
plt.tight_layout()
No description has been provided for this image

Those customer that spent an average of 3 hours on the device and had an average of 4 registered devices, had the highest churn rate.

In [39]:
#subsetting the dataset to determine the churn characteristics of customer that registered a complaint with the company
count_of_complaints=ecd_clean.groupby(['Complain','Churn']).apply(lambda x:x['Churn'].count()).reset_index(name='No. of Customers')
count_of_complaints 
Out[39]:
Complain Churn No. of Customers
0 0 0 3586
1 0 1 440
2 1 0 1096
3 1 1 508
In [40]:
complain_values = [0, 1]
churn_0 = [3586, 1096]  # Customer Retained
churn_1 = [440, 508]    # Customer Churned

complain_labels = ['No Complaint', 'Complaint']
bar_width = 0.30

r1 = range(len(complain_values))
r2 = [x + bar_width for x in r1]

fig, ax = plt.subplots(figsize=(10, 6))

plt.bar(r1, churn_0, color='#66B3FF', width=bar_width, edgecolor='grey', label='Customers Stayed')
plt.bar(r2, churn_1, color='#FF4433', width=bar_width, edgecolor='grey', label='Customers Left')


for i in range(len(complain_values)):
    plt.text(r1[i], churn_0[i] + 20, str(churn_0[i]), ha='center', va='bottom', color='black', fontweight='bold')
    plt.text(r2[i], churn_1[i] + 50, str(churn_1[i]), ha='center', va='bottom', color='black', fontweight='bold')

plt.xlabel('Complain', fontweight='bold')
plt.xticks([r + bar_width/2 for r in range(len(complain_values))], complain_labels)

plt.ylabel('No. of Customers', fontweight='bold')

plt.title('Complaint counts of Customers in a Company', fontweight='bold')

plt.legend()

plt.show()
No description has been provided for this image

Model Training and Prediction¶

In [41]:
# Import all the libraries for machine learning models

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,precision_score,recall_score,f1_score
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
In [42]:
ecd_clean.nunique(), ecd_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   CustomerID                   5630 non-null   int64   
 1   Churn                        5630 non-null   int64   
 2   Tenure                       5630 non-null   float64 
 3   PreferredLoginDevice         5630 non-null   object  
 4   CityTier                     5630 non-null   int64   
 5   WarehouseToHome              5630 non-null   float64 
 6   PreferredPaymentMode         5630 non-null   object  
 7   Gender                       5630 non-null   object  
 8   HourSpendOnApp               5630 non-null   float64 
 9   NumberOfDeviceRegistered     5630 non-null   int64   
 10  PreferedOrderCat             5630 non-null   object  
 11  SatisfactionScore            5630 non-null   int64   
 12  MaritalStatus                5630 non-null   object  
 13  NumberOfAddress              5630 non-null   int64   
 14  Complain                     5630 non-null   int64   
 15  OrderAmountHikeFromlastYear  5630 non-null   float64 
 16  CouponUsed                   5630 non-null   float64 
 17  OrderCount                   5630 non-null   float64 
 18  DaySinceLastOrder            5630 non-null   float64 
 19  CashbackAmount               5630 non-null   int64   
 20  TenureGroup                  5630 non-null   category
dtypes: category(1), float64(7), int64(8), object(5)
memory usage: 885.7+ KB
Out[42]:
(CustomerID                     5630
 Churn                             2
 Tenure                           36
 PreferredLoginDevice              2
 CityTier                          3
 WarehouseToHome                  35
 PreferredPaymentMode              5
 Gender                            2
 HourSpendOnApp                    6
 NumberOfDeviceRegistered          6
 PreferedOrderCat                  5
 SatisfactionScore                 5
 MaritalStatus                     3
 NumberOfAddress                  15
 Complain                          2
 OrderAmountHikeFromlastYear      17
 CouponUsed                       17
 OrderCount                       17
 DaySinceLastOrder                22
 CashbackAmount                  220
 TenureGroup                       7
 dtype: int64,
 None)
In [43]:
#converting the category dtype of TenureGroup column to numerical value
encoder = ce.OrdinalEncoder(mapping=[{'col': 'TenureGroup', 'mapping': {'0-10 years': 1, '11-20 years': 2, '21-30 years':3,
                                                                       '31-40 years': 4, '41-50 years': 5, '51-60 years': 6,
                                                                       '61 years & above': 7}}])

encoder.fit(ecd_clean)
ecd_clean = encoder.transform(ecd_clean)
ecd_clean.head()
Out[43]:
CustomerID Churn Tenure PreferredLoginDevice CityTier WarehouseToHome PreferredPaymentMode Gender HourSpendOnApp NumberOfDeviceRegistered ... SatisfactionScore MaritalStatus NumberOfAddress Complain OrderAmountHikeFromlastYear CouponUsed OrderCount DaySinceLastOrder CashbackAmount TenureGroup
0 50001 1 4.00 Mobile Phone 3 6.00 Debit Card Female 3.00 3 ... 2 Single 9 1 11.00 1.00 1.00 5.00 160 1
1 50002 1 0.00 Mobile Phone 1 8.00 UPI Male 3.00 4 ... 3 Single 7 1 15.00 0.00 1.00 0.00 121 1
2 50003 1 0.00 Mobile Phone 1 30.00 Debit Card Male 2.00 4 ... 3 Single 6 1 14.00 0.00 1.00 3.00 120 1
3 50004 1 0.00 Mobile Phone 3 15.00 Debit Card Male 2.00 4 ... 5 Single 8 0 23.00 0.00 1.00 3.00 134 1
4 50005 1 0.00 Mobile Phone 1 12.00 Credit Card Male 0.00 3 ... 5 Single 3 0 11.00 1.00 1.00 3.00 130 1

5 rows × 21 columns

In [44]:
ecd_clean['TenureGroup'].unique()
#importing libraries for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
In [45]:
#defining coloumns (target column, needed for encoding later)
cust_id = ['CustomerID']
Target = ["Churn"]

#creating categorical list columns with unique values less than 7
cat_variable = ecd_clean.nunique()[ecd_clean.nunique() < 7].keys().tolist()
cat_variable = [x for x in cat_variable if x not in Target]

#creating numnerical list columns
numerical_vari = [x for x in ecd_clean if x not in cat_variable + Target + cust_id]

#separating columns with only 2 unique values as binary
binary_vari = ecd_clean.nunique()[ecd_clean.nunique() == 2].keys().tolist()

#separating rest into multiple
more_then_2_vari = [i for i in cat_variable if i not in binary_vari]
#label encoding binary columns
lab_encod = LabelEncoder()
for x in binary_vari:
    ecd_clean[x] = lab_encod.fit_transform(ecd_clean[x])
ecd_clean = pd.get_dummies(ecd_clean, columns=more_then_2_vari)
#standardizing numerical columns for preprocessing
scaler = StandardScaler()
numerical_cols = ["Tenure", "WarehouseToHome", "NumberOfAddress", "CashbackAmount"]
ecd_clean[numerical_cols] = scaler.fit_transform(ecd_clean[numerical_cols])

#summary
ecd_clean.describe().transpose()
Out[45]:
count mean std min 25% 50% 75% max
CustomerID 5,630.00 52,815.50 1,625.39 50,001.00 51,408.25 52,815.50 54,222.75 55,630.00
Churn 5,630.00 0.17 0.37 0.00 0.00 0.00 0.00 1.00
Tenure 5,630.00 0.00 1.00 -1.13 -1.01 -0.20 0.61 5.95
PreferredLoginDevice 5,630.00 0.71 0.45 0.00 0.00 1.00 1.00 1.00
WarehouseToHome 5,630.00 0.00 1.00 -1.67 -0.66 -0.22 0.57 12.53
Gender 5,630.00 0.60 0.49 0.00 0.00 1.00 1.00 1.00
NumberOfAddress 5,630.00 0.00 1.00 -1.24 -0.86 -0.47 0.69 6.88
Complain 5,630.00 0.28 0.45 0.00 0.00 0.00 1.00 1.00
OrderAmountHikeFromlastYear 5,630.00 14.97 4.89 0.00 13.00 14.00 18.00 26.00
CouponUsed 5,630.00 1.67 1.89 0.00 1.00 1.00 2.00 16.00
OrderCount 5,630.00 2.87 2.94 0.00 1.00 2.00 3.00 16.00
DaySinceLastOrder 5,630.00 4.30 3.70 0.00 1.00 3.00 7.00 46.00
CashbackAmount 5,630.00 0.00 1.00 -3.60 -0.63 -0.29 0.38 3.00
TenureGroup 5,630.00 1.55 0.75 1.00 1.00 1.00 2.00 7.00
In [46]:
#calculating coorelation
corr_ecd = ecd_clean.corr()
matrix_cols = corr_ecd.columns.tolist()
corr_array = np.array(corr_ecd)
corr_ecd
Out[46]:
CustomerID Churn Tenure PreferredLoginDevice WarehouseToHome Gender NumberOfAddress Complain OrderAmountHikeFromlastYear CouponUsed ... PreferedOrderCat_Mobile Phone PreferedOrderCat_Others SatisfactionScore_1 SatisfactionScore_2 SatisfactionScore_3 SatisfactionScore_4 SatisfactionScore_5 MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single
CustomerID 1.00 -0.02 0.08 -0.00 0.12 0.00 0.16 -0.01 0.03 0.24 ... -0.00 -0.01 0.14 -0.20 -0.00 0.01 0.01 -0.24 0.18 -0.01
Churn -0.02 1.00 -0.35 -0.05 0.03 0.03 0.04 0.25 0.04 0.01 ... 0.22 -0.05 -0.07 -0.04 0.01 0.00 0.09 -0.02 -0.15 0.18
Tenure 0.08 -0.35 1.00 0.04 0.02 -0.05 0.25 -0.02 -0.18 0.10 ... -0.37 0.27 0.01 -0.00 0.00 0.01 -0.03 0.04 0.08 -0.12
PreferredLoginDevice -0.00 -0.05 0.04 1.00 -0.02 -0.02 0.03 0.00 -0.04 -0.02 ... 0.01 0.05 -0.02 0.01 -0.04 0.01 0.04 0.03 -0.00 -0.02
WarehouseToHome 0.12 0.03 0.02 -0.02 1.00 -0.00 0.01 0.02 0.04 0.03 ... -0.14 -0.05 -0.00 -0.02 0.01 0.00 0.00 -0.00 0.04 -0.04
Gender 0.00 0.03 -0.05 -0.02 -0.00 1.00 -0.03 -0.04 -0.02 -0.03 ... 0.05 0.03 0.03 0.01 -0.01 -0.03 -0.01 -0.01 0.04 -0.03
NumberOfAddress 0.16 0.04 0.25 0.03 0.01 -0.03 1.00 -0.03 -0.04 0.03 ... -0.14 0.05 -0.04 -0.01 -0.01 0.03 0.03 -0.00 0.01 -0.01
Complain -0.01 0.25 -0.02 0.00 0.02 -0.04 -0.03 1.00 0.01 -0.01 ... 0.01 -0.01 0.04 0.00 -0.01 -0.04 0.01 0.01 -0.00 -0.00
OrderAmountHikeFromlastYear 0.03 0.04 -0.18 -0.04 0.04 -0.02 -0.04 0.01 1.00 -0.09 ... 0.13 -0.49 0.03 0.01 -0.03 -0.01 0.01 -0.03 -0.01 0.04
CouponUsed 0.24 0.01 0.10 -0.02 0.03 -0.03 0.03 -0.01 -0.09 1.00 ... -0.12 0.05 -0.01 0.01 -0.02 0.01 0.02 0.03 0.01 -0.03
OrderCount 0.13 -0.02 0.16 -0.01 0.02 -0.02 -0.03 -0.03 -0.13 0.59 ... -0.18 0.18 -0.00 -0.00 -0.01 -0.00 0.02 0.02 0.03 -0.05
DaySinceLastOrder 0.08 -0.15 0.18 0.00 0.05 -0.02 -0.07 -0.04 -0.12 0.25 ... -0.26 0.18 -0.01 -0.01 -0.02 -0.00 0.04 0.01 0.04 -0.05
CashbackAmount 0.22 -0.15 0.51 0.05 0.07 -0.03 0.19 0.00 -0.38 0.17 ... -0.58 0.57 -0.00 0.00 -0.00 0.00 0.00 0.04 0.05 -0.09
TenureGroup 0.05 -0.25 0.93 0.04 0.02 -0.04 0.23 0.00 -0.17 0.06 ... -0.30 0.27 -0.00 -0.01 0.02 0.01 -0.02 0.04 0.06 -0.10
CityTier_1 -0.01 -0.08 0.04 -0.00 -0.03 0.02 0.03 -0.00 0.00 -0.02 ... 0.23 0.03 -0.02 0.01 0.04 -0.04 0.01 -0.01 0.02 -0.01
CityTier_2 0.02 0.02 0.01 0.01 -0.01 0.04 0.01 -0.01 -0.05 -0.03 ... 0.07 0.06 0.00 -0.04 -0.03 0.07 -0.01 -0.01 0.05 -0.05
CityTier_3 -0.00 0.08 -0.05 -0.00 0.04 -0.03 -0.03 0.01 0.02 0.03 ... -0.27 -0.06 0.02 0.01 -0.03 0.01 -0.01 0.02 -0.04 0.03
PreferredPaymentMode_Cash on Delivery -0.00 0.07 -0.02 0.02 -0.02 0.01 -0.01 -0.02 -0.01 0.02 ... 0.07 -0.04 -0.03 -0.00 0.01 0.02 -0.00 0.00 -0.02 0.02
PreferredPaymentMode_Credit Card -0.00 -0.05 0.00 -0.00 0.01 -0.02 0.02 0.00 0.00 -0.02 ... 0.05 0.02 -0.02 -0.03 0.03 -0.01 0.01 -0.00 0.01 -0.01
PreferredPaymentMode_Debit Card 0.00 -0.03 0.02 -0.03 -0.03 0.00 -0.00 -0.01 0.02 -0.00 ... 0.01 -0.01 0.03 0.03 0.02 -0.05 -0.02 -0.01 -0.02 0.02
PreferredPaymentMode_E wallet -0.01 0.06 0.02 0.04 0.05 -0.02 -0.02 0.01 -0.02 0.01 ... -0.17 0.00 0.00 0.00 -0.06 0.06 0.01 0.02 0.00 -0.02
PreferredPaymentMode_UPI 0.01 0.00 -0.04 0.00 0.00 0.04 0.00 0.02 -0.01 -0.00 ... 0.02 0.03 0.01 -0.01 -0.03 0.03 0.01 -0.01 0.04 -0.03
HourSpendOnApp_0.0 -0.19 0.03 -0.10 -0.00 -0.01 0.02 -0.11 -0.01 0.02 -0.08 ... 0.25 -0.05 0.00 0.00 0.01 -0.00 -0.01 -0.01 0.01 -0.01
HourSpendOnApp_1.0 -0.06 -0.04 0.03 -0.03 -0.01 -0.02 -0.03 -0.01 0.02 -0.02 ... -0.03 -0.02 -0.00 0.00 0.00 -0.00 0.00 -0.00 -0.01 0.01
HourSpendOnApp_2.0 -0.49 -0.02 -0.00 -0.01 -0.09 0.02 -0.11 -0.00 -0.04 -0.15 ... -0.10 0.03 0.01 0.01 0.00 0.00 -0.03 -0.01 -0.03 0.04
HourSpendOnApp_3.0 0.16 0.01 0.01 0.01 0.02 -0.00 0.05 0.01 -0.01 0.04 ... -0.03 0.01 -0.00 0.00 0.00 0.00 0.00 0.00 -0.00 -0.00
HourSpendOnApp_4.0 0.45 -0.00 0.03 0.01 0.08 -0.02 0.11 0.00 0.04 0.16 ... 0.02 -0.02 -0.01 -0.01 -0.01 -0.00 0.03 0.01 0.03 -0.04
HourSpendOnApp_5.0 0.02 -0.01 0.01 0.01 0.01 -0.01 0.01 -0.01 0.01 0.03 ... -0.00 -0.01 0.01 -0.01 -0.02 0.01 0.01 -0.01 -0.01 0.02
NumberOfDeviceRegistered_1 -0.15 -0.04 -0.01 0.01 -0.03 0.02 -0.05 -0.00 0.01 -0.06 ... 0.02 -0.02 0.00 -0.02 0.01 -0.01 0.02 -0.03 0.05 -0.03
NumberOfDeviceRegistered_2 0.14 -0.05 0.02 0.01 0.03 0.02 0.03 -0.00 0.03 0.03 ... 0.01 -0.02 -0.01 0.00 -0.00 -0.01 0.02 0.00 0.03 -0.03
NumberOfDeviceRegistered_3 -0.55 -0.03 -0.03 -0.00 -0.07 0.00 -0.10 0.00 -0.03 -0.18 ... -0.01 0.01 0.01 0.00 -0.00 -0.00 -0.01 0.02 -0.01 -0.01
NumberOfDeviceRegistered_4 0.29 -0.01 0.04 0.00 0.03 -0.01 0.07 -0.00 -0.02 0.09 ... -0.00 0.00 -0.00 -0.00 -0.00 0.01 -0.01 0.00 -0.01 0.01
NumberOfDeviceRegistered_5 0.23 0.06 -0.01 -0.00 0.03 -0.01 0.04 0.00 0.02 0.09 ... 0.00 0.00 -0.01 0.01 -0.00 0.01 -0.00 -0.01 -0.01 0.02
NumberOfDeviceRegistered_6 0.15 0.08 -0.03 -0.03 0.03 0.01 0.02 0.01 0.04 0.06 ... -0.00 0.01 0.02 0.00 0.01 -0.03 -0.01 -0.02 0.01 0.01
PreferedOrderCat_Fashion -0.01 -0.01 0.12 0.02 0.06 -0.03 0.04 0.01 0.07 0.14 ... -0.32 -0.09 -0.00 0.00 -0.01 -0.01 0.02 0.00 0.03 -0.03
PreferedOrderCat_Grocery 0.00 -0.09 0.35 0.01 0.05 -0.05 0.07 0.01 -0.13 0.01 ... -0.21 -0.06 -0.00 0.02 -0.02 0.01 0.00 0.04 0.01 -0.04
PreferedOrderCat_Laptop & Accessory 0.01 -0.13 -0.02 -0.05 0.09 -0.02 0.04 -0.02 0.11 -0.01 ... -0.58 -0.17 0.02 -0.00 0.01 -0.03 -0.00 -0.01 0.03 -0.03
PreferedOrderCat_Mobile Phone -0.00 0.22 -0.37 0.01 -0.14 0.05 -0.14 0.01 0.13 -0.12 ... 1.00 -0.17 -0.01 -0.00 -0.00 0.02 -0.00 -0.02 -0.07 0.09
PreferedOrderCat_Others -0.01 -0.05 0.27 0.05 -0.05 0.03 0.05 -0.01 -0.49 0.05 ... -0.17 1.00 -0.01 -0.02 0.03 0.02 -0.03 0.02 0.03 -0.04
SatisfactionScore_1 0.14 -0.07 0.01 -0.02 -0.00 0.03 -0.04 0.04 0.03 -0.01 ... -0.01 -0.01 1.00 -0.17 -0.34 -0.25 -0.25 -0.21 0.14 0.02
SatisfactionScore_2 -0.20 -0.04 -0.00 0.01 -0.02 0.01 -0.01 0.00 0.01 0.01 ... -0.00 -0.02 -0.17 1.00 -0.22 -0.17 -0.17 0.27 -0.22 0.02
SatisfactionScore_3 -0.00 0.01 0.00 -0.04 0.01 -0.01 -0.01 -0.01 -0.03 -0.02 ... -0.00 0.03 -0.34 -0.22 1.00 -0.32 -0.33 0.02 0.01 -0.02
SatisfactionScore_4 0.01 0.00 0.01 0.01 0.00 -0.03 0.03 -0.04 -0.01 0.01 ... 0.02 0.02 -0.25 -0.17 -0.32 1.00 -0.24 -0.01 0.01 0.00
SatisfactionScore_5 0.01 0.09 -0.03 0.04 0.00 -0.01 0.03 0.01 0.01 0.02 ... -0.00 -0.03 -0.25 -0.17 -0.33 -0.24 1.00 -0.00 0.01 -0.01
MaritalStatus_Divorced -0.24 -0.02 0.04 0.03 -0.00 -0.01 -0.00 0.01 -0.03 0.03 ... -0.02 0.02 -0.21 0.27 0.02 -0.01 -0.00 1.00 -0.45 -0.29
MaritalStatus_Married 0.18 -0.15 0.08 -0.00 0.04 0.04 0.01 -0.00 -0.01 0.01 ... -0.07 0.03 0.14 -0.22 0.01 0.01 0.01 -0.45 1.00 -0.73
MaritalStatus_Single -0.01 0.18 -0.12 -0.02 -0.04 -0.03 -0.01 -0.00 0.04 -0.03 ... 0.09 -0.04 0.02 0.02 -0.02 0.00 -0.01 -0.29 -0.73 1.00

47 rows × 47 columns

In [47]:
import plotly.graph_objs as go
import plotly.offline as py
#Plotting
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = "Pearson Correlation coefficient",
                                     titleside = "right"
                                    ) ,
                  )

layout = go.Layout(dict(title = "Correlation Matrix for variables",
                        autosize = False,
                        height  = 720,
                        width   = 800,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                      ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9))
                       )
                  )

data = [trace]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
In [48]:
#splitting dataset into train and test data
train,test = train_test_split(ecd_clean,test_size = .20 ,random_state = 0)

cols    = [i for i in ecd_clean.columns if i not in cust_id + Target]
X_train = train[cols]
Y_train = train[Target]
X_test  = test[cols]
Y_test  = test[Target]
#logistic Regression
# Create an instance of the LogisticRegression model
logistic_regression_model = LogisticRegression(random_state=0)

# Fit the model to the training data
logistic_regression_model.fit(X_train, Y_train)

# Make predictions on the test data
Y_pred = logistic_regression_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("")
print("Precision:", precision)
print("")
print("Recall:", recall)
print("")
print("F1 Score:", f1)
print("")

# Generate a confusion matrix
confusion_matrix_result = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(confusion_matrix_result)
print("")

df_cm = pd.DataFrame(confusion_matrix_result, index = (0, 1), columns = (0, 1))
plt.figure(figsize = (10, 7))
sns.heatmap(df_cm, annot = True, fmt ='g', cmap='hot')
print("Test Data Accuracy: %.6f" %accuracy_score(Y_test, Y_pred))
Accuracy: 0.8809946714031972

Precision: 0.712

Recall: 0.47593582887700536

F1 Score: 0.5705128205128205

Confusion Matrix:
[[903  36]
 [ 98  89]]

Test Data Accuracy: 0.880995
No description has been provided for this image
In [83]:
# Generate a classification report
classification_report_result = classification_report(Y_test, Y_pred)
print("Classification Report for Logistic Regression:")
print(classification_report_result)
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       939
           1       0.71      0.48      0.57       187

    accuracy                           0.88      1126
   macro avg       0.81      0.72      0.75      1126
weighted avg       0.87      0.88      0.87      1126

Decision Tree Classifier and Random Forest Classifier

In [50]:
#Building our model using Decision Tree Classifer
X = ecd_clean.drop(['Churn'],axis=1)
y = ecd_clean['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=128)

deci_tree = DecisionTreeClassifier()
deci_tree.fit(X_train,y_train)
deci_tree_pred = deci_tree.predict(X_test)
recall_score(y_test, deci_tree_pred)

# Calculate and print accuracy
accuracy_dt = accuracy_score(y_test, deci_tree_pred)
precision_dt = precision_score(y_test, deci_tree_pred)
recall_dt = recall_score(y_test, deci_tree_pred)
print("Accuracy:", accuracy_dt)
print("Precision:", precision_dt)
print("Recall:", recall_dt)

#confusion matrix
conf_matrix = confusion_matrix(y_test, deci_tree_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
Accuracy: 0.9316163410301954
Precision: 0.7723214285714286
Recall: 0.8693467336683417
No description has been provided for this image
In [51]:
#classification report for decision tree
class_report = classification_report(y_test, deci_tree_pred)
print("Classification Report for Decision Tree:\n", class_report)
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           0       0.97      0.94      0.96       927
           1       0.77      0.87      0.82       199

    accuracy                           0.93      1126
   macro avg       0.87      0.91      0.89      1126
weighted avg       0.94      0.93      0.93      1126

In [52]:
#generating future importance
feature_importance1 = deci_tree.feature_importances_
feature_names2 = X.columns

# Create a bar plot to visualize feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_names2, feature_importance1)
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Decision Tree Feature Importance')
plt.show()
No description has been provided for this image
In [53]:
#building our model using Random Forest Classifer
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=324)

rand_fcl = RandomForestClassifier(n_estimators=120)
rand_fcl.fit(X_train,y_train)

rand_fcl_pred = rand_fcl.predict(X_test)
recall_score(y_test, rand_fcl_pred)

# Evaluate the model's performance
accuracy_rfc = accuracy_score(y_test, rand_fcl_pred)
precision_rfc = precision_score(y_test, rand_fcl_pred)
recall_rfc = recall_score(y_test, rand_fcl_pred)

print("Accuracy:", accuracy_rfc)
print("Precision:", precision_rfc)
print("Recall:", recall_rfc)

# Create a confusion matrix
conf_matrix = confusion_matrix(y_test, rand_fcl_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="gray", fmt="d")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix Random Forest Classifier")
plt.show()
Accuracy: 0.9609236234458259
Precision: 0.9743589743589743
Recall: 0.7916666666666666
No description has been provided for this image
In [54]:
feature_importance2 = rand_fcl.feature_importances_
feature_names2 = X.columns

# Create a bar plot to visualize feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_names2, feature_importance2)
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Random Forest Feature Importance')
plt.show()
No description has been provided for this image
In [ ]: